Loading datasets and shuffling¶

In [ ]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
import cv2
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.utils import shuffle

train_df = pd.read_csv("emnist-letters-train.csv")
test_df = pd.read_csv("emnist-letters-test.csv")

train_df.columns = [i for i in range(785)]
test_df.columns = [i for i in range(785)]

print(train_df.shape)
print(train_df.info())
print(test_df.shape)
print(test_df.info())
dataset = pd.concat([train_df, test_df], axis=0)
print(dataset.shape)
print(dataset.info())
dataset = pd.DataFrame(shuffle(dataset))
print(dataset.shape)
(88799, 785)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88799 entries, 0 to 88798
Columns: 785 entries, 0 to 784
dtypes: int64(785)
memory usage: 531.8 MB
None
(14799, 785)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14799 entries, 0 to 14798
Columns: 785 entries, 0 to 784
dtypes: int64(785)
memory usage: 88.6 MB
None
(103598, 785)
<class 'pandas.core.frame.DataFrame'>
Index: 103598 entries, 0 to 14798
Columns: 785 entries, 0 to 784
dtypes: int64(785)
memory usage: 621.2 MB
None
(103598, 785)

Splitting the dataset into features and labels¶

In [ ]:
labels = dataset.iloc[:, 0]
print(labels.shape)

features = dataset.iloc[:, 1:]
print(features.shape)
(103598,)
(103598, 784)

Building label map¶

In [ ]:
mp = dict()
for i in range(0, 26):
    mp[i+1] = chr(ord('a') + i)
print(mp)
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}

Preprocessing images so all have same orientation and shuffling¶

In [ ]:
features = np.array(features)
print(features.shape)
features = features / 255.0

features_image = features.reshape(-1, 28, 28, 1) # adjust on the number of rows, each image is grey 28 * 28
features_image = np.rot90(features_image, axes=(1, 2))
features_image = np.flip(features_image, axis=1)
(103598, 784)

Drawing images function¶

In [ ]:
def draw_images(images, row_count, column_count):
    fig, axs = plt.subplots(row_count, column_count, figsize=(10,10))
    for i in range(row_count):
        for j in range(column_count):
            axs[i,j].imshow(images[i * column_count + j], cmap="gray")
            axs[i,j].axis('off')
    plt.show()

indices = np.random.choice(features_image.shape[0], 300) # choose from 0 to 103598 (shape[0]) 300 element
draw_images(features_image[indices].squeeze(), 15, 15)
No description has been provided for this image

train_test_split And fitting model using params from merged¶

In [ ]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(features, labels, train_size=0.8,
                                                     test_size=0.2, stratify=labels, random_state=42)

classifier = DecisionTreeClassifier(ccp_alpha= 0.0, class_weight= None, criterion='entropy',
            max_depth=20, max_features= None, max_leaf_nodes= None,
            min_impurity_decrease= 0.0, min_samples_leaf= 4, min_samples_split= 10,
              min_weight_fraction_leaf= 0.0, random_state= 42, splitter= 'best')

classifier.fit(X_train, y_train)
joblib.dump(classifier, 'best_DecisonTree_model_merged.joblib')

y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusionMatrix = confusion_matrix(y_test, y_pred)
print("classifier.score(X_test, y_test): ", classifier.score(X_test, y_test))
print(f'Accuracy of the Best Model: {accuracy}')
print("Classification Report: \n", classification_report(y_test, y_pred))
print("Confusion Matrix: \n", confusionMatrix)
classifier.score(X_test, y_test):  0.7071911196911197
Accuracy of the Best Model: 0.7071911196911197
Classification Report: 
               precision    recall  f1-score   support

           1       0.57      0.60      0.58       839
           2       0.67      0.70      0.68       839
           3       0.79      0.83      0.81       844
           4       0.66      0.68      0.67       840
           5       0.75      0.76      0.76       848
           6       0.66      0.70      0.68       839
           7       0.50      0.49      0.50       837
           8       0.68      0.71      0.69       845
           9       0.60      0.63      0.61       846
          10       0.76      0.78      0.77       840
          11       0.69      0.69      0.69       848
          12       0.63      0.62      0.62       843
          13       0.81      0.85      0.83       840
          14       0.69      0.72      0.70       833
          15       0.83      0.82      0.82       842
          16       0.76      0.79      0.77       846
          17       0.56      0.48      0.51       847
          18       0.73      0.69      0.71       844
          19       0.81      0.79      0.80       758
          20       0.65      0.66      0.66       687
          21       0.75      0.75      0.75       684
          22       0.78      0.79      0.79       684
          23       0.82      0.74      0.78       684
          24       0.75      0.69      0.72       687
          25       0.75      0.71      0.73       691
          26       0.82      0.75      0.78       685

    accuracy                           0.71     20720
   macro avg       0.71      0.71      0.71     20720
weighted avg       0.71      0.71      0.71     20720

Confusion Matrix: 
 [[507  18  14  23   6  11  30  33   1   0  13   2  18  42  17   7  29  20
    7   3  18   2  11   4   0   3]
 [ 17 587  11  38  20  11  30  26   5   4  11   5   1   7   8   7  14   5
   14   3   3   0   0   1   0  11]
 [ 11   9 702   1  32   6  10   1   1   2  11   7   3   2   8   7   9   4
    1   8   2   1   0   0   2   4]
 [ 18  49   2 575   1  11  14  18   8  19  10   3   1   8  35   4   7   3
    8   4  14   4   3   5   5  11]
 [ 14  15  52   4 646  14  12   3   2   0  12   2   4   4   2   8   5  15
    6  17   1   1   1   1   0   7]
 [ 12   5   5   4  18 584  16   4   9   4  10   6   7   7   1  60  11  15
   10  28   0   3   3   9   4   4]
 [ 34  31  11  12  22  25 411   4   3  24   6   5   8   5   6  11 132   5
   36  10   7   4   4   6   9   6]
 [ 31  23   0  11   4  10   3 601   6   3  25  17  23  27   3   1   5   7
    2   8  14   2   3  14   1   1]
 [  2   3   0  14   2  13   7   4 534  29   4 185   0   0   1   6   6   4
    3   7   0   1   0   2   8  11]
 [  3  12   4  27   2   4   8   2  25 657   1  13   1   2   3   3   5   0
   14  24   4   3   4   4   9   6]
 [  6   6  13  14  12  21   4  31   1   0 588   3   9  18   0   4   2  36
    1  11  15   6   1  36   3   7]
 [  2   5   6   8   0   7   8  15 229   5   7 522   0   1   0   0   2   1
    1   9   1   1   0   1   3   9]
 [ 15   9   0   1   3   3   3  19   1   2   8   0 711  31   2   2   1   0
    0   2   6   3  15   3   0   0]
 [ 30   3   2   8   2   2   7  42   0   2  13   1  36 600   8   2   6  10
    1   4   5   6  25   8   7   3]
 [ 20  13  11  42   8   3  15   1   0   5   0   1   5   6 687   1  11   1
    5   1   1   0   4   0   0   1]
 [  5   1   0   8   4  60   9   2   7   2   3   7   0   2   4 665  18  10
    0  18   1   7   1   0   9   3]
 [ 63  31  14  13  15  18 129   7   6   7   6   1   4  11  22  27 403  11
   12  15  10   5   3   4   8   2]
 [ 34   3  17   1  23  17   6   8   3   2  31   4   5  10   1  22  10 582
    2  24   1  14   3   7  11   3]
 [  7  14   2   6   3   8  33   1   5  40   4   0   1   2   4   0   8   3
  600   3   0   1   0   2   9   2]
 [  4   9   9  10  15  30   8   6  18  19   7  12   8   1   0  19   6  20
    3 453   0   1   4   1  17   7]
 [ 19   5   0  14   0   0   9   9   0  11  12   2   4  20  11   1   6   4
    1   1 514  33   4   2   2   0]
 [  3   4   2   8   3   2   3   2   2   4   2   3   2  11   3   2   2   8
    0   5  36 538  10   4  25   0]
 [ 10   3   6  10   1   4   6  14   1   4   4   1  14  40   3   1   4   2
    4   3  23   7 509   2   5   3]
 [ 13   0   0  11   8   9   8  29   6   2  48   5   4   4   0   0   3  12
    5   4   1   8   1 475  26   5]
 [  3   5   0   7   0   5  16   4   6  12   4   9   3   5   0   9  14  11
    4  16   5  33   5  24 491   0]
 [ 12  13   8   5  13   6  11   2  12   9  13  12   1   4   0   2   7   5
    4  12   1   2   3  15   2 511]]

Drawing ROC Curve¶

In [ ]:
from sklearn.preprocessing import LabelBinarizer
y_score = classifier.predict_proba(X_test)
print("Train labels: ")
print(pd.DataFrame(y_train).value_counts())
print("----------------------------------------------------------------------------")
print(pd.DataFrame(y_score).value_counts())
print("y_score_proba: shape ", y_score.shape)
print("----------------------------------------------------------------------------")
print("Pred labels: ")
print(pd.DataFrame(y_pred).value_counts())
print("----------------------------------------------------------------------------")
print("test_labels: ")
print(pd.DataFrame(y_test).value_counts())
print("----------------------------------------------------------------------------")
print("train_labels: ")
print(pd.DataFrame(y_train).value_counts())
print("----------------------------------------------------------------------------")
print("train_labels: ")
print(np.array(y_train.shape))

label_binarizer = LabelBinarizer()
y_bin = label_binarizer.fit_transform(y_test)

print(y_score.shape, y_bin.shape, y_bin[0], y_test.shape)
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = 26
for i in range(1, n_classes + 1):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i-1], y_score[:, i-1])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves
plt.figure(figsize=(10, 8))
for i in range(1, n_classes + 1):
    plt.plot(fpr[i], tpr[i], label=f'Class {mp[i]} (AUC = {roc_auc[i]:.2f})')

plt.plot([0, 1], [0, 1], 'k--', lw=2)  # Diagonal line for random classifier
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for Each Class (One-vs-All)')
plt.legend(loc='lower right')
plt.show()
Train labels: 
11    3390
5     3389
17    3388
16    3384
9     3382
8     3379
18    3375
3     3375
12    3372
15    3366
10    3362
13    3362
4     3358
2     3357
1     3356
6     3355
7     3348
14    3332
19    3034
25    2762
24    2750
20    2749
26    2742
22    2738
23    2738
21    2735
Name: count, dtype: int64
----------------------------------------------------------------------------
0    1         2    3    4    5         6    7    8     9         10   11   12   13   14   15   16        17   18   19        20   21        22        23     24        25      
0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  0.00  0.000000  0.0  0.0  1.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0  0.000000  0.000000  0.000  0.000000  0.000000    713
               1.0  0.0  0.0  0.000000  0.0  0.0  0.00  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0  0.000000  0.000000  0.000  0.000000  0.000000    712
               0.0  0.0  0.0  0.000000  0.0  0.0  0.00  0.000000  0.0  0.0  0.0  0.0  0.0  1.0  0.000000  0.0  0.0  0.000000  0.0  0.000000  0.000000  0.000  0.000000  0.000000    684
                                                        1.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0  0.000000  0.000000  0.000  0.000000  0.000000    660
                                             1.0  0.00  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0  0.000000  0.000000  0.000  0.000000  0.000000    649
                                                                                                                                                                                   ... 
                              0.166667  0.0  0.0  0.00  0.166667  0.0  0.0  0.0  0.0  0.0  0.0  0.333333  0.0  0.0  0.000000  0.0  0.000000  0.166667  0.000  0.000000  0.166667      1
     0.166667  0.0  0.0  0.0  0.000000  0.0  0.0  0.00  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.166667  0.0  0.0  0.333333  0.0  0.000000  0.000000  0.000  0.333333  0.000000      1
     0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  0.25  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0  0.000000  0.000000  0.125  0.000000  0.625000      1
                                                                                                                                                       0.000  0.750000  0.000000      1
     0.166667  0.0  0.0  0.0  0.000000  0.0  0.0  0.00  0.166667  0.0  0.0  0.0  0.0  0.0  0.0  0.500000  0.0  0.0  0.000000  0.0  0.166667  0.000000  0.000  0.000000  0.000000      1
Name: count, Length: 2374, dtype: int64
y_score_proba: shape  (20720, 26)
----------------------------------------------------------------------------
Pred labels: 
1     895
3     891
9     891
8     888
6     884
2     876
4     875
13    873
16    871
14    870
10    868
5     863
11    853
15    829
12    828
7     816
18    794
19    744
17    726
20    693
22    686
21    683
25    656
24    630
26    620
23    617
Name: count, dtype: int64
----------------------------------------------------------------------------
test_labels: 
5     848
11    848
17    847
16    846
9     846
8     845
18    844
3     844
12    843
15    842
10    840
4     840
13    840
6     839
2     839
1     839
7     837
14    833
19    758
25    691
20    687
24    687
26    685
21    684
22    684
23    684
Name: count, dtype: int64
----------------------------------------------------------------------------
train_labels: 
11    3390
5     3389
17    3388
16    3384
9     3382
8     3379
18    3375
3     3375
12    3372
15    3366
10    3362
13    3362
4     3358
2     3357
1     3356
6     3355
7     3348
14    3332
19    3034
25    2762
24    2750
20    2749
26    2742
22    2738
23    2738
21    2735
Name: count, dtype: int64
----------------------------------------------------------------------------
train_labels: 
[82878]
(20720, 26) (20720, 26) [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] (20720,)
No description has been provided for this image

Confusion Matrix¶

In [ ]:
import seaborn as sns

ls = []
for i in range(n_classes):
    ls.append(mp[i+1])

plt.figure(figsize=(12, 10))

sns.heatmap(confusionMatrix, annot=True, fmt='d', cmap='Blues', xticklabels=ls, yticklabels=ls)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
No description has been provided for this image

Plotting the Decision Tree¶

In [ ]:
from sklearn.tree import plot_tree

ls = []
for i in range(26):
    ls.append(mp[i+1])

plt.figure(figsize=(64, 256))
plot_tree(classifier, filled=True, class_names=ls)
plt.show()
No description has been provided for this image

Predict the output¶

In [ ]:
myRow = dataset.iloc[6, :] # enter row number
label = myRow.iloc[0]
features = myRow[1:]
features = np.array(features)
print(features.shape)
features_image = features.reshape((1,28,28,1))
features_image = np.rot90(features_image, axes=(1, 2))
features_image = np.flip(features_image, axis=1)
features_image = features_image.reshape((28,28,1))

print(mp[label])
print(features_image.shape)
plt.imshow(features_image, cmap='gray')
plt.title('Image Title')
plt.axis('off')  # Optional: Turn off axis labels
plt.show()
classifier = joblib.load('best_DecisonTree_model_merged.joblib')
y_pred = classifier.predict(features.reshape(1, -1))
print(mp[y_pred[0]])
print(y_pred[0] == label)

# draw_images(features_image.squeeze(), 1, 1)
(784,)
y
(28, 28, 1)
No description has been provided for this image
y
True